The idea is to use synthetic data to investigate some aspects of model flexibility and how it impacts bias and variance.
library(tidyverse)
library(tidymodels)
library(gganimate)
library(transformr)
library(kableExtra)
We generate a sample of a given \(\texttt{sample\_size}\), with true \(f(X)=2x + 6x^{2} + 2x^{3}\) and \(\epsilon\sim N(0,\texttt{true\_sigma}^{2})\).
# alternative true f (not used):
# true_f = function(x){return(((x^{4})/30)-(1/6)*x^{3}-(7/30)*x^{2}+(11/15)*x-2)}
true_f = function(x){return(2*x + 6*x^2 + 2*x^3)}  # true regression function f(X)
true_sigma = 5   # sd of the Gaussian noise
xmin = -3
xmax = 2
sample_size = 200
synth_data = tibble(x = runif(sample_size, min = xmin, max = xmax),
                    fx = true_f(x),
                    noise = rnorm(sample_size, mean = 0, sd = true_sigma),
                    y = fx + noise)
synth_plot = synth_data %>%
  ggplot(aes(x = x, y = y)) + theme_minimal() + geom_point(alpha = .1)
synth_plot
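To see the signal behind the noise, the true regression function can be overlaid on the scatter (a sketch, not part of the original code; it simply reuses \(\texttt{true\_f}\) through \(\texttt{stat\_function}\)):
# sketch: overlay the true f(X) on the synthetic data
synth_plot + stat_function(fun = true_f, colour = "indianred", size = 1)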
Even for a given data set, flexible methods may lead to different \(\hat{f}(X)\) due to the training/testing split.
n_samples = 99
train_prop = .25
# draw 99 training subsets, each a random 25% of the synthetic data
synth_samples = tibble(data = rerun(.n = n_samples, synth_data %>% slice_sample(prop = train_prop)),
                       sample_id = c(paste0("sample: 0", 1:9), paste0("sample: ", 10:n_samples)))
To visualize this, consider 99 random splits, with a small proportion of training observations.
sample_movie = synth_plot +
  geom_point(data = synth_samples %>% unnest(data), aes(x, y), colour = "dodgerblue", alpha = .5) +
  labs(title = '{closest_state}') +
  transition_states(sample_id, state_length = 1)
anim_save(filename = "pop_sample_movie.gif", animation = sample_movie, path = "figures/", nframes = 200)
Fitting a linear regression and a polynomial regression of degree 8 on a small training set results in highly variable fitted curves. To visualize that:
linear_reg_movie = synth_plot +
  geom_point(data = synth_samples %>% unnest(data), aes(x, y), colour = "dodgerblue", alpha = .5) +
  stat_smooth(data = synth_samples %>% unnest(data), aes(x, y), method = "lm",
              se = FALSE, color = "indianred", geom = 'line', size = 2) +
  labs(title = 'linear reg. {closest_state}',
       subtitle = paste0("proportion of training obs: ", train_prop)) +
  transition_states(sample_id, wrap = TRUE, state_length = 1) +
  shadow_trail(alpha = 0.35, exclude_layer = 2, color = "forestgreen", size = .5)
anim_save(filename = "linear_regression_movie.gif", animation = linear_reg_movie, path = "figures/", nframes = 200)
poly8_movie = synth_plot +
  geom_point(data = synth_samples %>% unnest(data), aes(x, y), colour = "dodgerblue", alpha = .5) +
  stat_smooth(data = synth_samples %>% unnest(data), aes(x, y), method = "lm",
              formula = y ~ poly(x, 8), se = FALSE, color = "indianred", geom = 'line', size = 2) +
  labs(title = 'polynomial reg. degree 8 {closest_state}',
       subtitle = paste0("proportion of training obs: ", train_prop)) +
  transition_states(sample_id, wrap = TRUE, state_length = 1) +
  shadow_trail(alpha = 0.35, exclude_layer = 2, color = "forestgreen", size = .5)
anim_save(filename = "poly8_movie.gif", animation = poly8_movie, path = "figures/", nframes = 200)
By increasing the training set size, the variability of the fitted curves will decrease.
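A quick numeric check of this claim (a sketch, not part of the original script; \(\texttt{slope\_spread}\) is a made-up helper): the standard deviation of the fitted slope of the linear regression across repeated training draws shrinks as the training proportion grows.
# sketch: spread of the fitted slope across repeated training subsets
slope_spread = function(prop, n_rep = 99) {
  map_dbl(1:n_rep, ~coef(lm(y ~ x, data = synth_data %>% slice_sample(prop = prop)))[2]) %>% sd()
}
slope_spread(.25)   # small training sets: larger spread of the slope
slope_spread(.75)   # larger training sets: smaller spread of the slope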
Since the data are generated from a known model, the bias-variance trade-off can be computed directly.
The expected test MSE at a specific test observation \(x_{0}\) is the sum of three components:
\[E \left[\left( y_{0} - \hat{f}(x_{0})\right)^{2}\right] = var\left(\hat{f}(x_{0})\right) + Bias\left[\hat{f}(x_{0})\right]^{2}+var(\epsilon)\]
Non-flexible methods have low variance, but they can have a high bias: increasing the flexibility reduces the bias, at the cost of increased variance. Now, since \(X\) is non-stochastic, fix its values \(X=\bf x\) and generate 1000 samples of size 100 from \(Y|X={\bf x}\): since \(f({\bf x})=2{\bf x}+6{\bf x}^{2}+2{\bf x}^{3}\) is known, generating a new sample only requires adding the Gaussian noise \(\epsilon\) to \(f({\bf x})\), where \(\epsilon\sim N(0,\sigma^{2}=\texttt{true\_sigma}^{2})\).
Set the sample size and the number of generated samples, then create a nested tibble of samples.
set.seed(123)
sample_size = 100
n_samples = 1000
# the same x values appear in every sample; only the noise (and hence y) changes
synth_ys = tibble(x = rep(runif(sample_size, min = 0, max = 2), n_samples)) %>%
  mutate(sample_id = rep(c(paste0("sample: 0", 1:9), paste0("sample: ", 10:n_samples)), each = sample_size),
         fx = true_f(x),
         noise = rnorm(n = n(), mean = 0, sd = true_sigma),
         y = fx + noise) %>%
  dplyr::select(sample_id, everything()) %>%
  group_by(sample_id) %>% nest() %>% ungroup()
synth_ys %>% slice(1:4)
## # A tibble: 4 × 2
## sample_id data
## <chr> <list>
## 1 sample: 01 <tibble [100 × 4]>
## 2 sample: 02 <tibble [100 × 4]>
## 3 sample: 03 <tibble [100 × 4]>
## 4 sample: 04 <tibble [100 × 4]>
To have a look at the nested tibbles, unnest \(\texttt{data}\): these are the training samples.
synth_ys %>% slice(1) %>% unnest(data) %>%
slice(1:6) %>% dplyr::select(-sample_id) %>%
kbl() %>% kable_styling(bootstrap_options = "hover",full_width = FALSE)
| x | fx | noise | y |
|---|---|---|---|
| 0.5751550 | 3.5156564 | 1.2665926 | 4.782249 |
| 1.5766103 | 25.9053804 | -0.1427338 | 25.762647 |
| 0.8179538 | 6.7447002 | -0.2143523 | 6.530348 |
| 1.7660348 | 33.2614408 | 6.8430114 | 40.104452 |
| 1.8809346 | 38.2985309 | -1.1288549 | 37.169676 |
| 0.0911130 | 0.2335482 | 7.5823530 | 7.815901 |
Now generate a small set of test observations \(x_{0}\), and then add random noise to obtain the corresponding \(y_{0}\)’s.
n_test = 10
test_xs = runif(n_test, min = xmin, max = xmax)
# the same 10 test x's are shared across all samples; only the noise (and y_0) changes
test_obs = tibble(sample_id = c(paste0("sample: 0", 1:9), paste0("sample: ", 10:n_samples))) %>%
  mutate(test_data = rerun(.n = n_samples,
                           tibble(x = test_xs) %>%
                             mutate(fx = true_f(x),
                                    noise = rnorm(n = n_test, sd = true_sigma),
                                    y = noise + fx))) %>%
  unnest(cols = test_data) %>% group_by(sample_id) %>%
  mutate(test_ob_id = 1:n()) %>% nest() %>%
  rename(test_data = data)
The next step is to merge the test observations into the training sample data structure.
synth_ys = synth_ys %>% inner_join(test_obs,by="sample_id")
Then a linear regression is fit on each training sample, and for each fitted function \(\hat{f}\), predictions are made for the test observations.
sample_results = synth_ys %>% mutate(
  model = map(.x = data, .f = ~lm(data = .x, formula = y ~ x)),
  model_preds = map2(.x = model, .y = test_data, .f = ~augment(.x, newdata = .y))
)
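To see what \(\texttt{augment}\) returns, it may help to peek at the predictions for one sample (a sketch, not in the original code): each row is a test observation with its prediction in \(\texttt{.fitted}\).
# sketch: predictions of the first fitted model on its test observations
sample_results %>% slice(1) %>% pull(model_preds) %>% pluck(1) %>% head()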
It is now possible to compute all the quantities: the squared bias, the variance, the irreducible error, and the expected test MSE, the latter computed both directly and as the sum of the three components.
sample_selected_quantities = sample_results %>% unnest(cols = model_preds) %>%
  dplyr::select(sample_id, x:.fitted)
test_elements = sample_selected_quantities %>% arrange(test_ob_id) %>% group_by(test_ob_id) %>%
  summarise(exp_test_mse = mean((.fitted - y)^2),
            E_f_hat = mean(.fitted),
            var_f_hat = var(.fitted) * ((n_samples - 1) / n_samples),  # population variance of f_hat(x_0)
            fx = mean(fx),
            var_eps = var(y),                                          # irreducible error at x_0
            squared_bias = (E_f_hat - fx)^2,
            decomp_test_mse = squared_bias + var_f_hat + var_eps,
            diff_mse = round(exp_test_mse - decomp_test_mse, digits = 3)) %>%
  dplyr::select(squared_bias, var_f_hat, var_eps, exp_test_mse, decomp_test_mse, diff_mse)
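To inspect the decomposition at each test observation, the resulting table can be rendered in the same style used above (a sketch; the \(\texttt{digits}\) choice is arbitrary):
# sketch: per-test-observation bias-variance decomposition
test_elements %>% kbl(digits = 3) %>%
  kable_styling(bootstrap_options = "hover", full_width = FALSE)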
Let’s now increase the model flexibility by fitting polynomial regressions of increasing degree.
tune_synth_ys = synth_ys %>% mutate(degree = map(.x = sample_id, ~1:10)) %>% unnest(cols = degree)
tuned_results = tune_synth_ys %>% mutate(
  model = map2(.x = data, .y = degree, .f = ~lm(data = .x, formula = y ~ poly(x, .y))),
  model_preds = map2(.x = model, .y = test_data, .f = ~augment(.x, newdata = .y))
)
# focus on the first test observation x_0; use the known true_sigma^2 as var(eps)
test_mse_by_degree = tuned_results %>% unnest(cols = model_preds) %>% filter(test_ob_id == 1) %>%
  group_by(degree) %>%
  summarise(var_f_hat = var(.fitted) * ((n_samples - 1) / n_samples),
            E_f_hat = mean(.fitted),
            fx = mean(fx),
            squared_bias = ((E_f_hat - fx)^2),
            exp_test_mse = (squared_bias + var_f_hat + true_sigma^2)) %>%
  dplyr::select(degree, exp_test_mse, var_f_hat, squared_bias) %>%
  rename(variance = var_f_hat, `test MSE` = exp_test_mse, `squared bias` = squared_bias)
test_mse_by_degree %>% slice_min(`test MSE`) %>%
kbl() %>% kable_styling(bootstrap_options = "hover")
| degree | test MSE | variance | squared bias |
|---|---|---|---|
| 2 | 25.57643 | 0.4405779 | 0.1358484 |
test_mse_by_degree %>%
  pivot_longer(names_to = "source", values_to = "error", cols = 2:4) %>%
  ggplot(aes(x = degree, y = error, group = source, color = source)) +
  geom_smooth(se = FALSE) +
  # geom_line() +
  geom_hline(yintercept = (true_sigma^2)) +  # horizontal line: the irreducible error var(eps)
  labs(title = "test MSE decomposition")